# load packages and data
library(cicero)
library(data.table)
library(Matrix)
library(proxy)
library(reshape2)
library(umap)
path <- '~/Desktop/Buenrostro/count_reads_peaks_output/'
files <- list.files(path,pattern = "\\.txt$")
length(files)
[1] 2034
#assuming tab separated values with a header
datalist <- lapply(files, function(x)fread(paste0(path,x))$V4)
dim(datafr)
[1] 237450 2034
df_regions = read.csv("~/Desktop/Buenrostro/combined.sorted.merged.bed",
sep = '\t',header=FALSE,stringsAsFactors=FALSE)
dim(df_regions)
[1] 237450 3
peaknames <- paste(df_regions$V1,df_regions$V2,df_regions$V3,sep = "_")
head(peaknames)
[1] "chr1_10413_10625" "chr1_13380_13624" "chr1_16145_16354" "chr1_96388_96812" "chr1_115650_115812" "chr1_237625_237888"
head(sapply(strsplit(files,'\\.'),'[', 1))
[1] "BM1077-CLP-Frozen-160106-13" "BM1077-CLP-Frozen-160106-14" "BM1077-CLP-Frozen-160106-2" "BM1077-CLP-Frozen-160106-21" "BM1077-CLP-Frozen-160106-27"
[6] "BM1077-CLP-Frozen-160106-3"
colnames(datafr) <- sapply(strsplit(files,'\\.'),'[', 1)
rownames(datafr) <- peaknames
datafr[1:5,1:5]
BM1077-CLP-Frozen-160106-13 BM1077-CLP-Frozen-160106-14 BM1077-CLP-Frozen-160106-2 BM1077-CLP-Frozen-160106-21 BM1077-CLP-Frozen-160106-27
chr1_10413_10625 0 0 0 0 0
chr1_13380_13624 0 0 0 0 0
chr1_16145_16354 0 0 0 0 0
chr1_96388_96812 0 0 0 0 0
chr1_115650_115812 0 0 0 0 0
dim(datafr)
[1] 237450 2034
# Convert the data into a sparse matrix format.
datafr <- Matrix(datafr, sparse = TRUE)
# Binarize the matrix
datafr@x[datafr@x>0]<- 1
# Format cell info
cellinfo <- as.data.frame(colnames(datafr))
colnames(cellinfo) <- "cells"
rownames(cellinfo) <- cellinfo$cells
peakinfo <- as.data.frame(cbind(df_regions, peaknames))
colnames(peakinfo) <- c("chr", "bp1", "bp2", "site_name")
rownames(peakinfo) <- peaknames
print(datafr[1:10,1:10])
10 x 10 sparse Matrix of class "dgCMatrix"
[[ suppressing 10 column names 'BM1077-CLP-Frozen-160106-13', 'BM1077-CLP-Frozen-160106-14', 'BM1077-CLP-Frozen-160106-2' ... ]]
chr1_10413_10625 . . . . . . . . . .
chr1_13380_13624 . . . . . . . . . .
chr1_16145_16354 . . . . . . . . . .
chr1_96388_96812 . . . . . . . . . .
chr1_115650_115812 . . . . . . . . . .
chr1_237625_237888 . . . . . . . . . .
chr1_240909_241193 . . . . . . . . . .
chr1_521446_521676 . . . . . . . . . .
chr1_540558_541085 . . . . . . . . . .
chr1_713705_714611 . . 1 . . . . . . 1
print(cellinfo[1:10, ])
[1] "BM1077-CLP-Frozen-160106-13" "BM1077-CLP-Frozen-160106-14" "BM1077-CLP-Frozen-160106-2" "BM1077-CLP-Frozen-160106-21" "BM1077-CLP-Frozen-160106-27"
[6] "BM1077-CLP-Frozen-160106-3" "BM1077-CLP-Frozen-160106-36" "BM1077-CLP-Frozen-160106-42" "BM1077-CLP-Frozen-160106-44" "BM1077-CLP-Frozen-160106-50"
print(peakinfo[1:10, ])
# Make CDS
input_cds <- suppressWarnings(new_cell_data_set(datafr, cell_metadata = cellinfo, gene_metadata = peakinfo))
input_cds <- monocle3::detect_genes(input_cds)
#Ensure there are no peaks included with zero reads
input_cds <- input_cds[Matrix::rowSums(exprs(input_cds)) != 0, ]
print(input_cds)
class: cell_data_set
dim: 237450 2034
metadata(1): cds_version
assays(1): counts
rownames(237450): chr1_10413_10625 chr1_13380_13624 ... chrY_59013930_59014161 chrY_59363205_59363360
rowData names(5): chr bp1 bp2 site_name num_cells_expressed
colnames(2034): BM1077-CLP-Frozen-160106-13 BM1077-CLP-Frozen-160106-14 ... singles-PB1022-mono-160128-95 singles-PB1022-mono-160128-96
colData names(3): cells Size_Factor num_genes_expressed
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
hist(Matrix::colSums(exprs(input_cds)), breaks = 20)

# Filter cells by peak_count
# Please set an appropriate threshold values according to your data
min_count <- 2000
max_count <- 15000
input_cds <- input_cds[,Matrix::colSums(exprs(input_cds)) >= min_count]
input_cds <- input_cds[,Matrix::colSums(exprs(input_cds)) <= max_count]
set.seed(2017)
input_cds <- detect_genes(input_cds)
input_cds <- estimate_size_factors(input_cds)
input_cds <- preprocess_cds(input_cds, method = "LSI")
input_cds <- reduce_dimension(input_cds, reduction_method = 'UMAP', preprocess_method = "LSI")
plot_cells(input_cds)
No trajectory to plot. Has learn_graph() been called yet?
cluster not found in colData(cds), cells will not be colored
cluster_cells() has not been called yet, can't color cells by cluster

umap_coords<- reducedDims(input_cds)$UMAP
cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = umap_coords)
Overlap QC metrics:
Cells per bin: 50
Maximum shared cells bin-bin: 44
Mean shared cells bin-bin: 1.95752308701845
Median shared cells bin-bin: 0
chromosome_length <- read.table("~/Desktop/Buenrostro/hg19.chrom.sizes")
conns <- run_cicero(cicero_cds, chromosome_length) # Takes a few minutes to run
[1] "Starting Cicero"
[1] "Calculating distance_parameter value"
[1] "Running models"
[1] "Assembling connections"
[1] "Successful cicero models: 11397"
[1] "Other models: "
Zero or one element in range
1208
[1] "Models with errors: 0"
[1] "Done"
# Save results (Optional)
saveRDS(conns, paste0("~/Desktop/Buenrostro/res_Buenrostro2018_cicero_connections.Rds"))
# Check results
head(conns)
all_peaks <- row.names(exprs(input_cds))
write.csv(x = all_peaks, file = paste0("~/Desktop/Buenrostro/res_Buenrostro2018_all_peaks.csv"))
write.csv(x = conns, file = paste0("~/Desktop/Buenrostro/res_Buenrostro2018_cicero_connections.csv"))
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayB0byBlc3RpbWF0ZSB0aGUgY28tYWNjZXNzaWJsaXR5IG9mIHNjQVRBQy1zZXEgcGVha3Mgd2l0aCBDaWNlcm8iCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyfQojIGxvYWQgcGFja2FnZXMgYW5kIGRhdGEKbGlicmFyeShjaWNlcm8pCmxpYnJhcnkoZGF0YS50YWJsZSkKbGlicmFyeShNYXRyaXgpCmxpYnJhcnkocHJveHkpCmxpYnJhcnkocmVzaGFwZTIpCmxpYnJhcnkodW1hcCkKCnBhdGggIDwtICd+L0Rlc2t0b3AvQnVlbnJvc3Ryby9jb3VudF9yZWFkc19wZWFrc19vdXRwdXQvJwpmaWxlcyA8LSBsaXN0LmZpbGVzKHBhdGgscGF0dGVybiA9ICJcXC50eHQkIikKbGVuZ3RoKGZpbGVzKQpgYGAKCmBgYHtyfQojYXNzdW1pbmcgdGFiIHNlcGFyYXRlZCB2YWx1ZXMgd2l0aCBhIGhlYWRlciAgICAKZGF0YWxpc3QgIDwtIGxhcHBseShmaWxlcywgZnVuY3Rpb24oeClmcmVhZChwYXN0ZTAocGF0aCx4KSkkVjQpIAojYXNzdW1pbmcgdGhlIHNhbWUgaGVhZGVyL2NvbHVtbnMgZm9yIGFsbCBmaWxlcwpkYXRhZnIgICAgPC0gZG8uY2FsbCgiY2JpbmQiLCBkYXRhbGlzdCkgCmBgYAoKYGBge3J9CmRpbShkYXRhZnIpCmBgYAoKCmBgYHtyfQpkZl9yZWdpb25zICAgICAgPC0gcmVhZC5jc3YoIn4vRGVza3RvcC9CdWVucm9zdHJvL2NvbWJpbmVkLnNvcnRlZC5tZXJnZWQuYmVkIiwgc2VwID0gJ1x0JyxoZWFkZXI9RkFMU0Usc3RyaW5nc0FzRmFjdG9ycz1GQUxTRSkKZGltKGRmX3JlZ2lvbnMpCmBgYAoKYGBge3J9CnBlYWtuYW1lcyAgICAgICA8LSBwYXN0ZShkZl9yZWdpb25zJFYxLGRmX3JlZ2lvbnMkVjIsZGZfcmVnaW9ucyRWMyxzZXAgPSAiXyIpCmhlYWQocGVha25hbWVzKQpoZWFkKHNhcHBseShzdHJzcGxpdChmaWxlcywnXFwuJyksJ1snLCAxKSkKCmNvbG5hbWVzKGRhdGFmcik8LSBzYXBwbHkoc3Ryc3BsaXQoZmlsZXMsJ1xcLicpLCdbJywgMSkKcm93bmFtZXMoZGF0YWZyKTwtIHBlYWtuYW1lcwpkYXRhZnJbMTo1LDE6NV0KYGBgCgpgYGB7cn0KZGltKGRhdGFmcikKYGBgCgoKYGBge3J9CiMgQ29udmVydCB0aGUgZGF0YSBpbnRvIGEgc3BhcnNlIG1hdHJpeCBmb3JtYXQuCmRhdGFmciAgICAgICAgICAgICAgPC0gTWF0cml4KGRhdGFmciwgc3BhcnNlID0gVFJVRSkKCiMgQmluYXJpemUgdGhlIG1hdHJpeApkYXRhZnJAeFtkYXRhZnJAeD4wXTwtIDEKCiMgRm9ybWF0IGNlbGwgaW5mbwpjZWxsaW5mbyAgICAgICAgICAgIDwtIGFzLmRhdGEuZnJhbWUoY29sbmFtZXMoZGF0YWZyKSkKY29sbmFtZXMoY2VsbGluZm8pICA8LSAiY2VsbHMiCnJvd25hbWVzKGNlbGxpbmZvKSAgPC0gY2VsbGluZm8kY2VsbHMKCnBlYWtpbmZvICAgICAgICAgICAgPC0gYXMuZGF0YS5mcmFtZShjYmluZChkZl9yZWdpb25zLCBwZWFrbmFtZXMpKQpjb2xuYW1lcyhwZWFraW5mbykgIDwtIGMoImNociIsICJicDEiLCAiYnAyIiwgInNpdGVfbmFtZSIpCnJvd25hbWVzKHBlYWtpbmZvKSAgPC0gcGVha25hbWVzCgpwcmludChkYXRhZnJbMToxMCwxOjEwXSkKcHJpbnQoY2VsbGluZm9bMToxMCwgXSkKcHJpbnQocGVha2luZm9bMToxMCwgXSkKYGBgCgoKYGBge3J9CiMgTWFrZSBDRFMKaW5wdXRfY2RzIDwtIHN1cHByZXNzV2FybmluZ3MobmV3X2NlbGxfZGF0YV9zZXQoZGF0YWZyLCBjZWxsX21ldGFkYXRhID0gY2VsbGluZm8sIGdlbmVfbWV0YWRhdGEgPSBwZWFraW5mbykpCmlucHV0X2NkcyA8LSBtb25vY2xlMzo6ZGV0ZWN0X2dlbmVzKGlucHV0X2NkcykKCiNFbnN1cmUgdGhlcmUgYXJlIG5vIHBlYWtzIGluY2x1ZGVkIHdpdGggemVybyByZWFkcwppbnB1dF9jZHMgPC0gaW5wdXRfY2RzW01hdHJpeDo6cm93U3VtcyhleHBycyhpbnB1dF9jZHMpKSAhPSAwLCBdIApwcmludChpbnB1dF9jZHMpCgpoaXN0KE1hdHJpeDo6Y29sU3VtcyhleHBycyhpbnB1dF9jZHMpKSwgYnJlYWtzID0gMjApCgojIEZpbHRlciBjZWxscyBieSBwZWFrX2NvdW50CiMgUGxlYXNlIHNldCBhbiBhcHByb3ByaWF0ZSB0aHJlc2hvbGQgdmFsdWVzIGFjY29yZGluZyB0byB5b3VyIGRhdGEgCm1pbl9jb3VudCA8LSAyMDAwCm1heF9jb3VudCA8LSAxNTAwMAppbnB1dF9jZHMgPC0gaW5wdXRfY2RzWyxNYXRyaXg6OmNvbFN1bXMoZXhwcnMoaW5wdXRfY2RzKSkgPj0gbWluX2NvdW50XSAKaW5wdXRfY2RzIDwtIGlucHV0X2Nkc1ssTWF0cml4Ojpjb2xTdW1zKGV4cHJzKGlucHV0X2NkcykpIDw9IG1heF9jb3VudF0gCgpzZXQuc2VlZCgyMDE3KQppbnB1dF9jZHMgPC0gZGV0ZWN0X2dlbmVzKGlucHV0X2NkcykKaW5wdXRfY2RzIDwtIGVzdGltYXRlX3NpemVfZmFjdG9ycyhpbnB1dF9jZHMpCmlucHV0X2NkcyA8LSBwcmVwcm9jZXNzX2NkcyhpbnB1dF9jZHMsIG1ldGhvZCA9ICJMU0kiKQppbnB1dF9jZHMgPC0gcmVkdWNlX2RpbWVuc2lvbihpbnB1dF9jZHMsIHJlZHVjdGlvbl9tZXRob2QgPSAnVU1BUCcsIHByZXByb2Nlc3NfbWV0aG9kID0gIkxTSSIpCgpwbG90X2NlbGxzKGlucHV0X2NkcykKYGBgCgoKYGBge3J9CnVtYXBfY29vcmRzPC0gcmVkdWNlZERpbXMoaW5wdXRfY2RzKSRVTUFQCmNpY2Vyb19jZHMgPC0gbWFrZV9jaWNlcm9fY2RzKGlucHV0X2NkcywgcmVkdWNlZF9jb29yZGluYXRlcyA9IHVtYXBfY29vcmRzKQpjaHJvbW9zb21lX2xlbmd0aCA8LSByZWFkLnRhYmxlKCJ+L0Rlc2t0b3AvQnVlbnJvc3Ryby9oZzE5LmNocm9tLnNpemVzIikKY29ubnMgICAgICA8LSBydW5fY2ljZXJvKGNpY2Vyb19jZHMsIGNocm9tb3NvbWVfbGVuZ3RoKSAjIFRha2VzIGEgZmV3IG1pbnV0ZXMgdG8gcnVuCgojIFNhdmUgcmVzdWx0cyAoT3B0aW9uYWwpCnNhdmVSRFMoY29ubnMsIHBhc3RlMCgifi9EZXNrdG9wL0J1ZW5yb3N0cm8vcmVzX0J1ZW5yb3N0cm8yMDE4X2NpY2Vyb19jb25uZWN0aW9ucy5SZHMiKSkKCiMgQ2hlY2sgcmVzdWx0cwpoZWFkKGNvbm5zKQoKYWxsX3BlYWtzICA8LSByb3cubmFtZXMoZXhwcnMoaW5wdXRfY2RzKSkKd3JpdGUuY3N2KHggPSBhbGxfcGVha3MsIGZpbGUgPSBwYXN0ZTAoIn4vRGVza3RvcC9CdWVucm9zdHJvL3Jlc19CdWVucm9zdHJvMjAxOF9hbGxfcGVha3MuY3N2IikpCndyaXRlLmNzdih4ID0gY29ubnMsIGZpbGUgPSBwYXN0ZTAoIn4vRGVza3RvcC9CdWVucm9zdHJvL3Jlc19CdWVucm9zdHJvMjAxOF9jaWNlcm9fY29ubmVjdGlvbnMuY3N2IikpCmBgYAoKCgoKCgoKCgoKCgoKCgoKCgoK